EAS503 Final Project - Varun Singh#
Links to Project:#
Import Statements
import sqlite3
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.feature_selection import mutual_info_classif
from ydata_profiling import ProfileReport
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings("ignore")
import mlflow
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, make_scorer
import mlflow.sklearn
Normalisation Script
'''
The code normalises the data and creates a normalised database through functions that create a table.
'''
class DataNormalizer:
def __init__(self, csv_file, db_file):
self.csv_file = csv_file
self.db_file = db_file
os.makedirs(os.path.dirname(self.db_file), exist_ok=True)
self.conn = sqlite3.connect(self.db_file)
self.cursor = self.conn.cursor()
self.cursor.execute("PRAGMA foreign_keys = ON;")
self.conn.commit()
def create_patients_table(self):
self.cursor.execute("DROP TABLE IF EXISTS Patients")
self.cursor.execute('''
CREATE TABLE Patients (
PatientID INTEGER PRIMARY KEY AUTOINCREMENT,
State TEXT,
Sex TEXT,
AgeCategory TEXT,
RaceEthnicityCategory TEXT,
HeightInMeters REAL,
WeightInKilograms REAL,
BMI REAL
)''')
data = []
with open(self.csv_file, 'r') as file:
next(file)
for line in file:
row = line.strip().split(',')
data.append((row[0], row[1], row[29], row[28], row[30], row[31], row[32]))
self.cursor.executemany('''
INSERT INTO Patients (State, Sex, AgeCategory, RaceEthnicityCategory, HeightInMeters, WeightInKilograms, BMI)
VALUES (?, ?, ?, ?, ?, ?, ?)
''', data)
self.conn.commit()
def create_health_status_table(self):
self.cursor.execute("DROP TABLE IF EXISTS HealthStatus")
self.cursor.execute('''
CREATE TABLE HealthStatus (
HealthStatusID INTEGER PRIMARY KEY AUTOINCREMENT,
PatientID INTEGER,
GeneralHealth TEXT,
PhysicalHealthDays INTEGER,
MentalHealthDays INTEGER,
LastCheckupTime TEXT,
FOREIGN KEY (PatientID) REFERENCES Patients(PatientID)
)''')
data = []
with open(self.csv_file, 'r') as file:
next(file)
for idx, line in enumerate(file, 1):
row = line.strip().split(',')
data.append((idx, row[2], row[3], row[4], row[5]))
self.cursor.executemany('''
INSERT INTO HealthStatus (PatientID, GeneralHealth, PhysicalHealthDays, MentalHealthDays, LastCheckupTime)
VALUES (?, ?, ?, ?, ?)
''', data)
self.conn.commit()
def create_lifestyle_table(self):
self.cursor.execute("DROP TABLE IF EXISTS Lifestyle")
self.cursor.execute('''
CREATE TABLE Lifestyle (
LifestyleID INTEGER PRIMARY KEY AUTOINCREMENT,
PatientID INTEGER,
PhysicalActivities TEXT,
SleepHours INTEGER,
AlcoholDrinkers TEXT,
SmokerStatus TEXT,
ECigaretteUsage TEXT,
FOREIGN KEY (PatientID) REFERENCES Patients(PatientID)
)''')
data = []
with open(self.csv_file, 'r') as file:
next(file)
for idx, line in enumerate(file, 1):
row = line.strip().split(',')
data.append((idx, row[6], row[7], row[33], row[25], row[26]))
self.cursor.executemany('''
INSERT INTO Lifestyle (PatientID, PhysicalActivities, SleepHours, AlcoholDrinkers, SmokerStatus, ECigaretteUsage)
VALUES (?, ?, ?, ?, ?, ?)
''', data)
self.conn.commit()
def create_medical_history_table(self):
self.cursor.execute("DROP TABLE IF EXISTS MedicalHistory")
self.cursor.execute('''
CREATE TABLE MedicalHistory (
MedicalHistoryID INTEGER PRIMARY KEY AUTOINCREMENT,
PatientID INTEGER,
RemovedTeeth TEXT,
HadHeartAttack TEXT,
HadAngina TEXT,
HadStroke TEXT,
HadAsthma TEXT,
HadSkinCancer TEXT,
HadCOPD TEXT,
HadDepressiveDisorder TEXT,
HadKidneyDisease TEXT,
HadArthritis TEXT,
HadDiabetes TEXT,
FOREIGN KEY (PatientID) REFERENCES Patients(PatientID)
)''')
data = []
with open(self.csv_file, 'r') as file:
next(file)
for idx, line in enumerate(file, 1):
row = line.strip().split(',')
data.append((idx, row[8], row[9], row[10], row[11], row[12], row[13], row[14], row[15], row[16], row[17], row[18]))
self.cursor.executemany('''
INSERT INTO MedicalHistory (PatientID, RemovedTeeth, HadHeartAttack, HadAngina, HadStroke, HadAsthma, HadSkinCancer, HadCOPD, HadDepressiveDisorder, HadKidneyDisease, HadArthritis, HadDiabetes)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
''', data)
self.conn.commit()
def create_disabilities_table(self):
self.cursor.execute("DROP TABLE IF EXISTS Disabilities")
self.cursor.execute('''
CREATE TABLE Disabilities (
DisabilityID INTEGER PRIMARY KEY AUTOINCREMENT,
PatientID INTEGER,
DeafOrHardOfHearing TEXT,
BlindOrVisionDifficulty TEXT,
DifficultyConcentrating TEXT,
DifficultyWalking TEXT,
DifficultyDressingBathing TEXT,
DifficultyErrands TEXT,
FOREIGN KEY (PatientID) REFERENCES Patients(PatientID)
)''')
data = []
with open(self.csv_file, 'r') as file:
next(file)
for idx, line in enumerate(file, 1):
row = line.strip().split(',')
data.append((idx, row[19], row[20], row[21], row[22], row[23], row[24]))
self.cursor.executemany('''
INSERT INTO Disabilities (PatientID, DeafOrHardOfHearing, BlindOrVisionDifficulty, DifficultyConcentrating, DifficultyWalking, DifficultyDressingBathing, DifficultyErrands)
VALUES (?, ?, ?, ?, ?, ?, ?)
''', data)
self.conn.commit()
def create_preventive_care_table(self):
self.cursor.execute("DROP TABLE IF EXISTS PreventiveCare")
self.cursor.execute('''
CREATE TABLE PreventiveCare (
PreventiveCareID INTEGER PRIMARY KEY AUTOINCREMENT,
PatientID INTEGER,
ChestScan TEXT,
HIVTesting TEXT,
FluVaxLast12 TEXT,
PneumoVaxEver TEXT,
TetanusLast10Tdap TEXT,
FOREIGN KEY (PatientID) REFERENCES Patients(PatientID)
)''')
data = []
with open(self.csv_file, 'r') as file:
next(file)
for idx, line in enumerate(file, 1):
row = line.strip().split(',')
data.append((idx, row[27], row[34], row[35], row[36], row[37]))
self.cursor.executemany('''
INSERT INTO PreventiveCare (PatientID, ChestScan, HIVTesting, FluVaxLast12, PneumoVaxEver, TetanusLast10Tdap)
VALUES (?, ?, ?, ?, ?, ?)
''', data)
self.conn.commit()
def create_risk_factors_table(self):
self.cursor.execute("DROP TABLE IF EXISTS RiskFactors")
self.cursor.execute('''
CREATE TABLE RiskFactors (
RiskFactorID INTEGER PRIMARY KEY AUTOINCREMENT,
PatientID INTEGER,
HighRiskLastYear TEXT,
CovidPos TEXT,
FOREIGN KEY (PatientID) REFERENCES Patients(PatientID)
)''')
data = []
with open(self.csv_file, 'r') as file:
next(file)
for idx, line in enumerate(file, 1):
row = line.strip().split(',')
data.append((idx, row[38], row[39]))
self.cursor.executemany('''
INSERT INTO RiskFactors (PatientID, HighRiskLastYear, CovidPos)
VALUES (?, ?, ?)
''', data)
self.conn.commit()
def close_connection(self):
self.conn.close()
def run(self):
try:
print(f"Creating database: {self.db_file}")
print("Creating Patients table...")
self.create_patients_table()
print("Creating HealthStatus table...")
self.create_health_status_table()
print("Creating Lifestyle table...")
self.create_lifestyle_table()
print("Creating MedicalHistory table...")
self.create_medical_history_table()
print("Creating Disabilities table...")
self.create_disabilities_table()
print("Creating PreventiveCare table...")
self.create_preventive_care_table()
print("Creating RiskFactors table...")
self.create_risk_factors_table()
print("All tables created successfully")
except sqlite3.Error as e:
print(f"An error occurred: {e}")
finally:
self.close_connection()
csv_file = r"C:\Users\singh\Downloads\503 Project\heart_2022_with_nans.csv"
if not os.path.exists(csv_file):
print(f"CSV file not found: {csv_file}")
exit(1)
db_file = r"C:\Users\singh\Downloads\503 Project\normalized2.db"
if os.path.exists(db_file):
os.remove(db_file)
print(f"Existing database file deleted: {db_file}")
normalizer = DataNormalizer(csv_file, db_file)
normalizer.run()
Existing database file deleted: C:\Users\singh\Downloads\503 Project\normalized2.db
Creating database: C:\Users\singh\Downloads\503 Project\normalized2.db
Creating Patients table...
Creating HealthStatus table...
Creating Lifestyle table...
Creating MedicalHistory table...
Creating Disabilities table...
Creating PreventiveCare table...
Creating RiskFactors table...
All tables created successfully
Dataframe Merging Script
'''
Joins all tables in normalised db to create a dataframe for EDA and Pre-processing
'''
def join_all_tables(db_file):
conn = sqlite3.connect(db_file)
query = """
SELECT
Patients.PatientID,
Patients.State,
Patients.Sex,
Patients.AgeCategory,
Patients.RaceEthnicityCategory,
Patients.HeightInMeters,
Patients.WeightInKilograms,
Patients.BMI,
HealthStatus.GeneralHealth,
HealthStatus.PhysicalHealthDays,
HealthStatus.MentalHealthDays,
HealthStatus.LastCheckupTime,
Lifestyle.PhysicalActivities,
Lifestyle.SleepHours,
Lifestyle.AlcoholDrinkers,
Lifestyle.SmokerStatus,
Lifestyle.ECigaretteUsage,
MedicalHistory.RemovedTeeth,
MedicalHistory.HadHeartAttack,
MedicalHistory.HadAngina,
MedicalHistory.HadStroke,
MedicalHistory.HadAsthma,
MedicalHistory.HadSkinCancer,
MedicalHistory.HadCOPD,
MedicalHistory.HadDepressiveDisorder,
MedicalHistory.HadKidneyDisease,
MedicalHistory.HadArthritis,
MedicalHistory.HadDiabetes,
Disabilities.DeafOrHardOfHearing,
Disabilities.BlindOrVisionDifficulty,
Disabilities.DifficultyConcentrating,
Disabilities.DifficultyWalking,
Disabilities.DifficultyDressingBathing,
Disabilities.DifficultyErrands,
PreventiveCare.ChestScan,
PreventiveCare.HIVTesting,
PreventiveCare.FluVaxLast12,
PreventiveCare.PneumoVaxEver,
PreventiveCare.TetanusLast10Tdap,
RiskFactors.HighRiskLastYear,
RiskFactors.CovidPos
FROM Patients
JOIN HealthStatus ON Patients.PatientID = HealthStatus.PatientID
JOIN Lifestyle ON Patients.PatientID = Lifestyle.PatientID
JOIN MedicalHistory ON Patients.PatientID = MedicalHistory.PatientID
JOIN Disabilities ON Patients.PatientID = Disabilities.PatientID
JOIN PreventiveCare ON Patients.PatientID = PreventiveCare.PatientID
JOIN RiskFactors ON Patients.PatientID = RiskFactors.PatientID
"""
df = pd.read_sql_query(query, conn)
conn.close()
return df
db_file = r"C:\Users\singh\Downloads\503 Project\normalized2.db"
joined_df = join_all_tables(db_file)
joined_df.to_csv(r"C:\Users\singh\Downloads\503 Project\joined.csv", index=False)
print("Joined data saved to 'joined_patient_data.csv'")
Joined data saved to 'joined_patient_data.csv'
Data Profiling and EDA
'''
Copying the dataframe, and creating the response variable as a sum of three Heart Disease related variables, as well as removing some non-related variables
'''
df = joined_df.copy()
df["HadHeartIssues"] = df[["HadHeartAttack", "HadAngina", "HadStroke"]].apply(
lambda row: "Yes" if "Yes" in row.values else "No", axis=1)
df.drop(columns=["HadHeartAttack", "HadAngina", "HadStroke", "RemovedTeeth", "TetanusLast10Tdap", "State", "PatientID"], inplace=True)
profile = ProfileReport(df, title="Pandas Profiling Report")
profile
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
Cell In[4], line 9
7 df.drop(columns=["HadHeartAttack", "HadAngina", "HadStroke", "RemovedTeeth", "TetanusLast10Tdap", "State", "PatientID"], inplace=True)
8 profile = ProfileReport(df, title="Pandas Profiling Report")
----> 9 profile
File ~\projects\Lib\site-packages\IPython\core\displayhook.py:268, in DisplayHook.__call__(self, result)
266 self.start_displayhook()
267 self.write_output_prompt()
--> 268 format_dict, md_dict = self.compute_format_data(result)
269 self.update_user_ns(result)
270 self.fill_exec_result(result)
File ~\projects\Lib\site-packages\IPython\core\displayhook.py:157, in DisplayHook.compute_format_data(self, result)
127 def compute_format_data(self, result):
128 """Compute format data of the object to be displayed.
129
130 The format data is a generalization of the :func:`repr` of an object.
(...)
155
156 """
--> 157 return self.shell.display_formatter.format(result)
File ~\projects\Lib\site-packages\IPython\core\formatters.py:238, in DisplayFormatter.format(self, obj, include, exclude)
236 md = None
237 try:
--> 238 data = formatter(obj)
239 except:
240 # FIXME: log the exception
241 raise
File ~\projects\Lib\site-packages\decorator.py:232, in decorate.<locals>.fun(*args, **kw)
230 if not kwsyntax:
231 args, kw = fix(args, kw, sig)
--> 232 return caller(func, *(extras + args), **kw)
File ~\projects\Lib\site-packages\IPython\core\formatters.py:282, in catch_format_error(method, self, *args, **kwargs)
280 """show traceback on failed format call"""
281 try:
--> 282 r = method(self, *args, **kwargs)
283 except NotImplementedError:
284 # don't warn on NotImplementedErrors
285 return self._check_return(None, args[0])
File ~\projects\Lib\site-packages\IPython\core\formatters.py:406, in BaseFormatter.__call__(self, obj)
404 method = get_real_method(obj, self.print_method)
405 if method is not None:
--> 406 return method()
407 return None
408 else:
File ~\projects\Lib\site-packages\ydata_profiling\profile_report.py:541, in ProfileReport._repr_html_(self)
539 def _repr_html_(self) -> None:
540 """The ipython notebook widgets user interface gets called by the jupyter notebook."""
--> 541 self.to_notebook_iframe()
File ~\projects\Lib\site-packages\ydata_profiling\profile_report.py:521, in ProfileReport.to_notebook_iframe(self)
519 with warnings.catch_warnings():
520 warnings.simplefilter("ignore")
--> 521 display(get_notebook_iframe(self.config, self))
File ~\projects\Lib\site-packages\ydata_profiling\report\presentation\flavours\widget\notebook.py:75, in get_notebook_iframe(config, profile)
73 output = get_notebook_iframe_src(config, profile)
74 elif attribute == IframeAttribute.srcdoc:
---> 75 output = get_notebook_iframe_srcdoc(config, profile)
76 else:
77 raise ValueError(
78 f'Iframe Attribute can be "src" or "srcdoc" (current: {attribute}).'
79 )
File ~\projects\Lib\site-packages\ydata_profiling\report\presentation\flavours\widget\notebook.py:29, in get_notebook_iframe_srcdoc(config, profile)
27 width = config.notebook.iframe.width
28 height = config.notebook.iframe.height
---> 29 src = html.escape(profile.to_html())
31 iframe = f'<iframe width="{width}" height="{height}" srcdoc="{src}" frameborder="0" allowfullscreen></iframe>'
33 return HTML(iframe)
File ~\projects\Lib\site-packages\ydata_profiling\profile_report.py:491, in ProfileReport.to_html(self)
483 def to_html(self) -> str:
484 """Generate and return complete template as lengthy string
485 for using with frameworks.
486
(...)
489
490 """
--> 491 return self.html
File ~\projects\Lib\site-packages\ydata_profiling\profile_report.py:287, in ProfileReport.html(self)
284 @property
285 def html(self) -> str:
286 if self._html is None:
--> 287 self._html = self._render_html()
288 return self._html
File ~\projects\Lib\site-packages\ydata_profiling\profile_report.py:404, in ProfileReport._render_html(self)
401 def _render_html(self) -> str:
402 from ydata_profiling.report.presentation.flavours import HTMLReport
--> 404 report = self.report
406 with tqdm(
407 total=1, desc="Render HTML", disable=not self.config.progress_bar
408 ) as pbar:
409 html = HTMLReport(copy.deepcopy(report)).render(
410 nav=self.config.html.navbar_show,
411 offline=self.config.html.use_local_assets,
(...)
419 version=self.description_set.package["ydata_profiling_version"],
420 )
File ~\projects\Lib\site-packages\ydata_profiling\profile_report.py:281, in ProfileReport.report(self)
278 @property
279 def report(self) -> Root:
280 if self._report is None:
--> 281 self._report = get_report_structure(self.config, self.description_set)
282 return self._report
File ~\projects\Lib\site-packages\ydata_profiling\profile_report.py:263, in ProfileReport.description_set(self)
260 @property
261 def description_set(self) -> BaseDescription:
262 if self._description_set is None:
--> 263 self._description_set = describe_df(
264 self.config,
265 self.df,
266 self.summarizer,
267 self.typeset,
268 self._sample,
269 )
270 return self._description_set
File ~\projects\Lib\site-packages\ydata_profiling\model\describe.py:107, in describe(config, df, summarizer, typeset, sample)
103 correlation_names = get_active_correlations(config)
104 pbar.total += len(correlation_names)
106 correlations = {
--> 107 correlation_name: progress(
108 calculate_correlation,
109 pbar,
110 f"Calculate {correlation_name} correlation",
111 )(config, df, correlation_name, series_description)
112 for correlation_name in correlation_names
113 }
115 # make sure correlations is not None
116 correlations = {
117 key: value for key, value in correlations.items() if value is not None
118 }
File ~\projects\Lib\site-packages\ydata_profiling\utils\progress_bar.py:11, in progress.<locals>.inner(*args, **kwargs)
8 @wraps(fn)
9 def inner(*args, **kwargs) -> Any:
10 bar.set_postfix_str(message)
---> 11 ret = fn(*args, **kwargs)
12 bar.update()
13 return ret
File ~\projects\Lib\site-packages\ydata_profiling\model\correlations.py:102, in calculate_correlation(config, df, correlation_name, summary)
100 correlation = None
101 try:
--> 102 correlation = correlation_measures[correlation_name].compute(
103 config, df, summary
104 )
105 except (ValueError, AssertionError, TypeError, DataError, IndexError) as e:
106 warn_correlation(correlation_name, str(e))
File ~\projects\Lib\site-packages\multimethod\__init__.py:375, in multimethod.__call__(self, *args, **kwargs)
373 func = self.dispatch(*args)
374 try:
--> 375 return func(*args, **kwargs)
376 except TypeError as ex:
377 raise DispatchError(f"Function {func.__code__}") from ex
File ~\projects\Lib\site-packages\ydata_profiling\model\pandas\correlations_pandas.py:213, in pandas_auto_compute(config, df, summary)
206 def f(col_name: str, method: Callable) -> pd.Series:
207 return (
208 df_discretized
209 if col_name in numerical_columns and method is _pairwise_cramers
210 else df
211 )
--> 213 score = method(
214 f(col_1_name, method)[col_1_name], f(col_2_name, method)[col_2_name]
215 )
216 (
217 correlation_matrix.loc[col_1_name, col_2_name],
218 correlation_matrix.loc[col_2_name, col_1_name],
219 ) = (score, score)
221 return correlation_matrix
File ~\projects\Lib\site-packages\ydata_profiling\model\pandas\correlations_pandas.py:87, in _pairwise_cramers(col_1, col_2)
86 def _pairwise_cramers(col_1: pd.Series, col_2: pd.Series) -> float:
---> 87 return _cramers_corrected_stat(pd.crosstab(col_1, col_2), correction=True)
File ~\projects\Lib\site-packages\pandas\core\reshape\pivot.py:728, in crosstab(index, columns, values, rownames, colnames, aggfunc, margins, margins_name, dropna, normalize)
724 kwargs = {"aggfunc": aggfunc}
726 # error: Argument 7 to "pivot_table" of "DataFrame" has incompatible type
727 # "**Dict[str, object]"; expected "Union[...]"
--> 728 table = df.pivot_table(
729 "__dummy__",
730 index=unique_rownames,
731 columns=unique_colnames,
732 margins=margins,
733 margins_name=margins_name,
734 dropna=dropna,
735 observed=False,
736 **kwargs, # type: ignore[arg-type]
737 )
739 # Post-process
740 if normalize is not False:
File ~\projects\Lib\site-packages\pandas\core\frame.py:9509, in DataFrame.pivot_table(self, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)
9492 @Substitution("")
9493 @Appender(_shared_docs["pivot_table"])
9494 def pivot_table(
(...)
9505 sort: bool = True,
9506 ) -> DataFrame:
9507 from pandas.core.reshape.pivot import pivot_table
-> 9509 return pivot_table(
9510 self,
9511 values=values,
9512 index=index,
9513 columns=columns,
9514 aggfunc=aggfunc,
9515 fill_value=fill_value,
9516 margins=margins,
9517 dropna=dropna,
9518 margins_name=margins_name,
9519 observed=observed,
9520 sort=sort,
9521 )
File ~\projects\Lib\site-packages\pandas\core\reshape\pivot.py:102, in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)
99 table = concat(pieces, keys=keys, axis=1)
100 return table.__finalize__(data, method="pivot_table")
--> 102 table = __internal_pivot_table(
103 data,
104 values,
105 index,
106 columns,
107 aggfunc,
108 fill_value,
109 margins,
110 dropna,
111 margins_name,
112 observed,
113 sort,
114 )
115 return table.__finalize__(data, method="pivot_table")
File ~\projects\Lib\site-packages\pandas\core\reshape\pivot.py:183, in __internal_pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)
173 if observed is lib.no_default and any(
174 ping._passed_categorical for ping in grouped._grouper.groupings
175 ):
176 warnings.warn(
177 "The default value of observed=False is deprecated and will change "
178 "to observed=True in a future version of pandas. Specify "
(...)
181 stacklevel=find_stack_level(),
182 )
--> 183 agged = grouped.agg(aggfunc)
185 if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
186 agged = agged.dropna(how="all")
File ~\projects\Lib\site-packages\pandas\core\groupby\generic.py:1466, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
1463 # grouper specific aggregations
1464 if self._grouper.nkeys > 1:
1465 # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
-> 1466 return self._python_agg_general(func, *args, **kwargs)
1467 elif args or kwargs:
1468 # test_pass_args_kwargs gets here (with and without as_index)
1469 # can't return early
1470 result = self._aggregate_frame(func, *args, **kwargs)
File ~\projects\Lib\site-packages\pandas\core\groupby\generic.py:1517, in DataFrameGroupBy._python_agg_general(self, func, *args, **kwargs)
1514 warn_alias_replacement(self, orig_func, alias)
1515 f = lambda x: func(x, *args, **kwargs)
-> 1517 if self.ngroups == 0:
1518 # e.g. test_evaluate_with_empty_groups different path gets different
1519 # result dtype in empty case.
1520 return self._python_apply_general(f, self._selected_obj, is_agg=True)
1522 obj = self._obj_with_exclusions
File ~\projects\Lib\site-packages\pandas\core\groupby\groupby.py:856, in BaseGroupBy.ngroups(self)
853 @final
854 @property
855 def ngroups(self) -> int:
--> 856 return self._grouper.ngroups
File properties.pyx:36, in pandas._libs.properties.CachedProperty.__get__()
File ~\projects\Lib\site-packages\pandas\core\groupby\ops.py:774, in BaseGrouper.ngroups(self)
771 @final
772 @cache_readonly
773 def ngroups(self) -> int:
--> 774 return len(self.result_index)
File properties.pyx:36, in pandas._libs.properties.CachedProperty.__get__()
File ~\projects\Lib\site-packages\pandas\core\groupby\ops.py:787, in BaseGrouper.result_index(self)
784 if len(self.groupings) == 1:
785 return self.groupings[0]._result_index.rename(self.names[0])
--> 787 codes = self.reconstructed_codes
788 levels = [ping._result_index for ping in self.groupings]
789 return MultiIndex(
790 levels=levels, codes=codes, verify_integrity=False, names=self.names
791 )
File ~\projects\Lib\site-packages\pandas\core\groupby\ops.py:778, in BaseGrouper.reconstructed_codes(self)
776 @property
777 def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
--> 778 codes = self.codes
779 ids, obs_ids, _ = self.group_info
780 return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)
File ~\projects\Lib\site-packages\pandas\core\groupby\ops.py:690, in BaseGrouper.codes(self)
687 @final
688 @property
689 def codes(self) -> list[npt.NDArray[np.signedinteger]]:
--> 690 return [ping.codes for ping in self.groupings]
File ~\projects\Lib\site-packages\pandas\core\groupby\grouper.py:691, in Grouping.codes(self)
689 @property
690 def codes(self) -> npt.NDArray[np.signedinteger]:
--> 691 return self._codes_and_uniques[0]
File properties.pyx:36, in pandas._libs.properties.CachedProperty.__get__()
File ~\projects\Lib\site-packages\pandas\core\groupby\grouper.py:835, in Grouping._codes_and_uniques(self)
830 uniques = self._uniques
831 else:
832 # GH35667, replace dropna=False with use_na_sentinel=False
833 # error: Incompatible types in assignment (expression has type "Union[
834 # ndarray[Any, Any], Index]", variable has type "Categorical")
--> 835 codes, uniques = algorithms.factorize( # type: ignore[assignment]
836 self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
837 )
838 return codes, uniques
File ~\projects\Lib\site-packages\pandas\core\algorithms.py:795, in factorize(values, sort, use_na_sentinel, size_hint)
792 # Don't modify (potentially user-provided) array
793 values = np.where(null_mask, na_value, values)
--> 795 codes, uniques = factorize_array(
796 values,
797 use_na_sentinel=use_na_sentinel,
798 size_hint=size_hint,
799 )
801 if sort and len(uniques) > 0:
802 uniques, codes = safe_sort(
803 uniques,
804 codes,
(...)
807 verify=False,
808 )
File ~\projects\Lib\site-packages\pandas\core\algorithms.py:595, in factorize_array(values, use_na_sentinel, size_hint, na_value, mask)
592 hash_klass, values = _get_hashtable_algo(values)
594 table = hash_klass(size_hint or len(values))
--> 595 uniques, codes = table.factorize(
596 values,
597 na_sentinel=-1,
598 na_value=na_value,
599 mask=mask,
600 ignore_na=use_na_sentinel,
601 )
603 # re-cast e.g. i8->dt64/td64, uint8->bool
604 uniques = _reconstruct_data(uniques, original.dtype, original)
KeyboardInterrupt:
We can see the profile above, which shows all the columns that are being used, as well as the range of values, alerts, covariance matrix, plots, heatmaps, etc.
There are around 445,000 rows. However the profile also shows us that our response variable has high imbalance of class, which we will need to take care for in the next steps, through undersampling, which will reduce our row count significantly
There are other variables that have issues, such as state having a high cardinality, missing data values, etc
The following code will perform basic cleanup tasks and make the data ready for use
'''
Below code performs a bunch of cleanup tasks that were determined from stufying the data.
Some of these columns, though useful for other prediction tasks (such as if a person has SkinCancer or not),
are not relevant for this prediction. Further, values such as Yes and No have been converted to True and false, as well as
numeric columns have been cleaned to maintain data type consistencies
We can compare the profiles at the end and see that this data is more suited for processing
'''
# Function to convert yes/no to boolean
def yes_no_to_bool(value):
if pd.isna(value):
return np.nan
return value.lower() == 'yes'
# Standardize columns
df['Sex'] = df['Sex'].astype('category')
df['AgeCategory'] = df['AgeCategory'].astype('category')
df['RaceEthnicityCategory'] = df['RaceEthnicityCategory'].astype('category')
df['HeightInMeters'] = pd.to_numeric(df['HeightInMeters'], errors='coerce')
df['WeightInKilograms'] = pd.to_numeric(df['WeightInKilograms'], errors='coerce')
df['BMI'] = pd.to_numeric(df['BMI'], errors='coerce')
df['GeneralHealth'] = df['GeneralHealth'].astype('category')
df['PhysicalHealthDays'] = pd.to_numeric(df['PhysicalHealthDays'], errors='coerce')
df['MentalHealthDays'] = pd.to_numeric(df['MentalHealthDays'], errors='coerce')
df['LastCheckupTime'] = df['LastCheckupTime'].astype('category')
df['SleepHours'] = pd.to_numeric(df['SleepHours'], errors='coerce')
df['SmokerStatus'] = df['SmokerStatus'].astype('category')
df['ECigaretteUsage'] = df['ECigaretteUsage'].astype('category')
had_columns = [col for col in df.columns if col.startswith('Had')]
for col in had_columns:
df[col] = df[col].apply(yes_no_to_bool)
binary_columns = [
"DeafOrHardOfHearing", "BlindOrVisionDifficulty", "DifficultyConcentrating",
"DifficultyWalking", "DifficultyDressingBathing", "DifficultyErrands",
"FluVaxLast12", "PneumoVaxEver", "HighRiskLastYear", "CovidPos", "PhysicalActivities",
"AlcoholDrinkers", "ChestScan", "HIVTesting"
]
for col in binary_columns:
df[col] = df[col].apply(yes_no_to_bool)
categorical_columns = ['ECigaretteUsage', 'LastCheckupTime', 'GeneralHealth',
'RaceEthnicityCategory', 'AgeCategory', 'SmokerStatus']
for column in categorical_columns:
df = df[df[column].notna() & (df[column] != '')]
numeric_columns = ['HeightInMeters', 'WeightInKilograms', 'BMI',
'PhysicalHealthDays', 'MentalHealthDays', 'SleepHours']
for column in numeric_columns:
df = df[df[column].notna()]
df = df.dropna()
df.to_csv(r"C:\Users\singh\Downloads\503 Project\cleaned_joined2.csv", index=False)
profile = ProfileReport(df, title="Pandas Profiling Report")
profile
'''
We need to perfrom Undersampling as there is a great class imbalance in our response variable.
The below code tries to fix that by taking only a subset of the False class
'''
X = df.drop('HadHeartIssues', axis=1)
y = df['HadHeartIssues']
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)
df_undersampled = pd.DataFrame(X_resampled, columns=X.columns)
df_undersampled['HadHeartIssues'] = y_resampled
df = df_undersampled
df.to_csv(r"C:\Users\singh\Downloads\503 Project\cleaned_joined3.csv", index=False)
#We can check the profile again at the end to see if the data is now cleaned
profile = ProfileReport(df, title="Pandas Profiling Report")
profile
#Alerts that were ignored
# - 17 duplicate rows - Duplicacy arrives from deletion of some variables, and has only 17 rows, hence was ignored
import os
import mlflow
# Set tracking URI and credentials
MLFLOW_TRACKING_URI = "https://dagshub.com/singhvarunnn789/EAS503.mlflow"
os.environ["MLFLOW_TRACKING_USERNAME"] = "singhvarunnn789"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "0b1c49603d59c6ba067650000962ccbb763351a5"
# Set the tracking URI
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
# Set experiment
mlflow.set_experiment("logistic_regression_experiment")
<Experiment: artifact_location='mlflow-artifacts:/86d8cd44ef014a1a85602108549a0d9c', creation_time=1734611106268, experiment_id='0', last_update_time=1734611106268, lifecycle_stage='active', name='logistic_regression_experiment', tags={}>
Splitting the data into test and train sets in a ratio of 2:8#
# Split the data
X = df.drop('HadHeartIssues', axis=1)
y = df['HadHeartIssues']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Experiment 1#
Pipeline for Pre-processing#
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler()),
('minmax', MinMaxScaler()),
('log', FunctionTransformer(np.log1p, validate=False))
])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
])
Create pipeline for Logistic Regression#
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', LogisticRegression())
])
Parameter Hypertuning#
from sklearn.model_selection import GridSearchCV
param_grid = {
'classifier__C': [0.1, 1, 10],
'classifier__penalty': ['l1', 'l2'],
'classifier__solver': ['liblinear', 'saga']
}
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
10 - fold Cross validation#
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(best_model, X_train, y_train, cv=10, scoring='f1')
Logging results in MLFlow#
import mlflow
from mlflow.models import infer_signature
import os
# Set MLflow tracking URI
MLFLOW_TRACKING_URI = "https://dagshub.com/singhvarunnn789/EAS503.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME'] = 'singhvarunnn789'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '0b1c49603d59c6ba067650000962ccbb763351a5'
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment("logistic_regression_experiment")
y_pred = best_model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Log results with MLflow
with mlflow.start_run():
mlflow.log_params(grid_search.best_params_)
mlflow.log_metric("f1_score", f1)
mlflow.log_metric("true_negatives", tn)
mlflow.log_metric("false_positives", fp)
mlflow.log_metric("false_negatives", fn)
mlflow.log_metric("true_positives", tp)
mlflow.log_metric("cv_f1_mean", np.mean(cv_scores))
mlflow.log_metric("cv_f1_std", np.std(cv_scores))
# Log the model
signature = infer_signature(X_train, best_model.predict(X_train))
mlflow.sklearn.log_model(
sk_model=best_model,
artifact_path="logistic_regression_model",
signature=signature,
input_example=X_train.iloc[0].to_dict(),
registered_model_name="LogisticRegressionModel"
)
print(f"Best parameters: {grid_search.best_params_}")
print(f"F1-score on test set: {f1:.4f}")
print(f"Confusion matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")
print(f"Cross-validation F1-scores - Mean: {np.mean(cv_scores):.4f}, Std: {np.std(cv_scores):.4f}")
Registered model 'LogisticRegressionModel' already exists. Creating a new version of this model...
2024/12/20 21:36:21 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegressionModel, version 26
Created version '26' of model 'LogisticRegressionModel'.
๐ View run invincible-sheep-666 at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/0/runs/99a10fcc64d74094bb11e5d8f3de2544
๐งช View experiment at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/0
Best parameters: {'classifier__C': 10, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
F1-score on test set: 0.5963
Confusion matrix: TN=6511, FP=1630, FN=4696, TP=3418
Cross-validation F1-scores - Mean: 0.5144, Std: 0.0083
Adding the results of Experiments as a dictionary to a list so we can plot the F1 scores at the end#
print(f"Best parameters: {grid_search.best_params_}")
print(f"F1-score on test set: {f1:.4f}")
print(f"Confusion matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")
print(f"Cross-validation F1-scores - Mean: {np.mean(cv_scores):.4f}, Std: {np.std(cv_scores):.4f}")
experiment_results = {
"experiment_name": "Experiment 1: Logistic Regression",
"f1_score": f1,
"confusion_matrix": {
"TN": tn,
"FP": fp,
"FN": fn,
"TP": tp
},
"cv_f1_mean": np.mean(cv_scores),
"cv_f1_std": np.std(cv_scores)
}
all_experiment_results = []
all_experiment_results.append(experiment_results)
Best parameters: {'classifier__C': 10, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
F1-score on test set: 0.5963
Confusion matrix: TN=6511, FP=1630, FN=4696, TP=3418
Cross-validation F1-scores - Mean: 0.5144, Std: 0.0083
Experiment 2#
Pipeline for pre-processing#
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
])
Pipeline for Logistic Regression, Ridge, RandomForest, and XGClassifier#
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# Create pipelines for each classifier
log_reg_pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', LogisticRegression())
])
ridge_pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', RidgeClassifier())
])
rf_pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', RandomForestClassifier())
])
xgb_pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))
])
Model Fitting for Logistic Regression, Ridge, RandomForest, and XGBClassfier#
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score
# List of pipelines
pipelines = [
('LogisticRegression', log_reg_pipeline),
('RidgeClassifier', ridge_pipeline),
('RandomForestClassifier', rf_pipeline),
('XGBClassifier', xgb_pipeline)
]
for name, pipeline in pipelines:
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='f1')
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"{name}:")
print(f" CV F1-score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
print(f" Test Accuracy: {accuracy:.4f}")
print(f" Test F1-score: {f1:.4f}")
LogisticRegression:
CV F1-score: 0.4999 (+/- 0.0175)
Test Accuracy: 0.6110
Test F1-score: 0.5933
RidgeClassifier:
CV F1-score: 0.4932 (+/- 0.0168)
Test Accuracy: 0.6098
Test F1-score: 0.5903
RandomForestClassifier:
CV F1-score: 0.5771 (+/- 0.0088)
Test Accuracy: 0.5843
Test F1-score: 0.5842
XGBClassifier:
CV F1-score: 0.5952 (+/- 0.0130)
Test Accuracy: 0.6226
Test F1-score: 0.6211
Logging results in MLFlow#
import mlflow
import os
# Set MLflow tracking URI
MLFLOW_TRACKING_URI = "https://dagshub.com/singhvarunnn789/EAS503.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME'] = 'singhvarunnn789'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '0b1c49603d59c6ba067650000962ccbb763351a5'
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment("classifier_comparison")
# Log results with MLflow
for name, pipeline in pipelines:
with mlflow.start_run(run_name=name):
# Log the model
mlflow.sklearn.log_model(pipeline, f"{name}_model")
# Log metrics
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='f1')
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
mlflow.log_metric("cv_f1_mean", cv_scores.mean())
mlflow.log_metric("cv_f1_std", cv_scores.std())
mlflow.log_metric("test_accuracy", accuracy)
mlflow.log_metric("test_f1", f1)
# Log parameters
mlflow.log_params(pipeline.get_params())
2024/12/20 21:38:39 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
๐ View run LogisticRegression at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/1/runs/b25071262a6a44c3860ac01c1ad229f1
๐งช View experiment at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/1
2024/12/20 21:38:50 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
๐ View run RidgeClassifier at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/1/runs/5bc9552711a046c5896eb5e2bbe2e729
๐งช View experiment at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/1
2024/12/20 21:39:06 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
๐ View run RandomForestClassifier at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/1/runs/24acfe08a0184783a140b545bec3da37
๐งช View experiment at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/1
2024/12/20 21:43:54 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
๐ View run XGBClassifier at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/1/runs/2bcbb7fe0f5245d39ae8409524ee8dc8
๐งช View experiment at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/1
Adding the results of Experiments as a dictionary to a list so we can plot the F1 scores at the end#
experiment_results = []
for name, pipeline in pipelines:
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='f1')
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
result = {
"experiment_name": f"Experiment 2: {name}",
"cv_f1_mean": cv_scores.mean(),
"cv_f1_std": cv_scores.std(),
"test_accuracy": accuracy,
"test_f1": f1
}
experiment_results.append(result)
# Add these results to the overall list of experiment results
all_experiment_results.extend(experiment_results)
Experiment 3#
Feature Engineering#
SMOTE was also used in this step as the classes were really unbalanced, which was causing bias and a low F1 score
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.base import BaseEstimator, TransformerMixin
import mlflow
# Custom Transformer for Feature Engineering
class FeatureEngineer(BaseEstimator, TransformerMixin):
def __init__(self):
pass
def fit(self, X, y=None):
return self
def transform(self, X):
X = X.copy()
X['BMI_category'] = pd.cut(X['BMI'], bins=[0, 18.5, 25, 30, np.inf], labels=['Underweight', 'Normal', 'Overweight', 'Obese'])
age_map = {'Age 18 to 24': 21, 'Age 25 to 29': 27, 'Age 30 to 34': 32, 'Age 35 to 39': 37,
'Age 40 to 44': 42, 'Age 45 to 49': 47, 'Age 50 to 54': 52, 'Age 55 to 59': 57,
'Age 60 to 64': 62, 'Age 65 to 69': 67, 'Age 70 to 74': 72, 'Age 75 to 79': 77,
'Age 80 or older': 85}
X['Age_BMI_interaction'] = X['AgeCategory'].map(age_map) * X['BMI']
X['HealthScore'] = 30 - (X['PhysicalHealthDays'] + X['MentalHealthDays'])
X['LifestyleScore'] = (
X['PhysicalActivities'].astype(int) +
((X['SleepHours'] >= 7) & (X['SleepHours'] <= 9)).astype(int) +
(~X['AlcoholDrinkers']).astype(int) +
(X['SmokerStatus'] == 'Never smoked').astype(int)
)
comorbidity_columns = ['HadAsthma', 'HadSkinCancer', 'HadCOPD',
'HadDepressiveDisorder', 'HadKidneyDisease',
'HadArthritis', 'HadDiabetes']
X['ComorbidityCount'] = X[comorbidity_columns].sum(axis=1)
X['CheckedWithinYear'] = (X['LastCheckupTime'] == 'Within past year (anytime less than 12 months ago)').astype(int)
return X
X = df.drop('HadHeartIssues', axis=1)
y = df['HadHeartIssues']
feature_engineer = FeatureEngineer()
X_engineered = feature_engineer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_engineered, y, test_size=0.2, random_state=42)
Creating pipelines for preprocessing#
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
# Create preprocessing steps for numeric and categorical features
numeric_transformer = ImbPipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])
categorical_transformer = ImbPipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
])
Creating pipelines for Classification algorithms, similar to experiment 2#
pipelines = [
('LogisticRegression', ImbPipeline([
('preprocessor', preprocessor),
('smote', SMOTE(random_state=42)),
('classifier', LogisticRegression(random_state=42))
])),
('RidgeClassifier', ImbPipeline([
('preprocessor', preprocessor),
('smote', SMOTE(random_state=42)),
('classifier', RidgeClassifier(random_state=42))
])),
('RandomForestClassifier', ImbPipeline([
('preprocessor', preprocessor),
('smote', SMOTE(random_state=42)),
('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
])),
('XGBClassifier', ImbPipeline([
('preprocessor', preprocessor),
('smote', SMOTE(random_state=42)),
('classifier', XGBClassifier(random_state=42))
]))
]
Logging results in MLFlow#
best_model = None
best_f1 = 0
# Set up MLflow for tracking experiments
mlflow.set_tracking_uri("https://dagshub.com/singhvarunnn789/EAS503.mlflow")
mlflow.set_experiment("Feature_Engineering")
with mlflow.start_run():
for name, pipeline in pipelines:
with mlflow.start_run(nested=True):
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='f1')
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
mlflow.log_param("model", name)
if hasattr(pipeline.named_steps['classifier'], "n_estimators"):
mlflow.log_param("n_estimators", pipeline.named_steps['classifier'].n_estimators)
mlflow.log_metric("cv_f1_score", cv_scores.mean())
mlflow.log_metric("test_accuracy", accuracy)
mlflow.log_metric("test_f1_score", f1)
print(f"{name}:")
print(f" CV F1-score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
print(f" Test Accuracy: {accuracy:.4f}")
print(f" Test F1-score: {f1:.4f}")
if f1 > best_f1:
best_f1 = f1
best_model = pipeline
print(f"\nBest model: {type(best_model.named_steps['classifier']).__name__}")
print(f"Best F1-score: {best_f1:.4f}")
LogisticRegression:
CV F1-score: 0.7570 (+/- 0.0087)
Test Accuracy: 0.7533
Test F1-score: 0.7594
๐ View run merciful-mouse-169 at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/2/runs/d7e1b4d12cbc451f8729998600e55705
๐งช View experiment at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/2
RidgeClassifier:
CV F1-score: 0.7578 (+/- 0.0115)
Test Accuracy: 0.7535
Test F1-score: 0.7613
๐ View run abrasive-skink-344 at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/2/runs/33ac317e0d6b4c579e09d8e110a8530b
๐งช View experiment at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/2
RandomForestClassifier:
CV F1-score: 0.7414 (+/- 0.0086)
Test Accuracy: 0.7387
Test F1-score: 0.7453
๐ View run indecisive-chimp-694 at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/2/runs/546276ec88374c00b14d5ac582441a36
๐งช View experiment at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/2
XGBClassifier:
CV F1-score: 0.7539 (+/- 0.0085)
Test Accuracy: 0.7496
Test F1-score: 0.7581
๐ View run smiling-rat-896 at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/2/runs/d8185e24a55c4dd19f74b5104201f987
๐งช View experiment at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/2
๐ View run big-hog-230 at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/2/runs/3135f7e9a6454b2daabdf0166acabcc9
๐งช View experiment at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/2
Best model: RidgeClassifier
Best F1-score: 0.7613
Saving The Ridge Model as the best model#
Ridge Model has the highest F1 scores as well as interpretibility, hence this was selected as the best model, and used for the FastAPI Application
# After feature engineering
print("Columns after feature engineering:")
print(X_engineered.columns.tolist())
# Update numeric and categorical features
numeric_features = X_engineered.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_engineered.select_dtypes(include=['object', 'category']).columns.tolist()
# Recreate the preprocessor with updated feature lists
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
])
# Recreate the RidgeClassifier pipeline
ridge_pipeline = ImbPipeline([
('preprocessor', preprocessor),
('smote', SMOTE(random_state=42)),
('classifier', RidgeClassifier(random_state=42))
])
ridge_pipeline.fit(X_engineered, y)
import joblib
joblib.dump(ridge_pipeline, 'best_model_ridge.joblib')
sample_data = X_engineered.iloc[0].to_dict()
sample_df = pd.DataFrame([sample_data])
prediction = ridge_pipeline.predict(sample_df)
Columns after feature engineering:
['Sex', 'AgeCategory', 'RaceEthnicityCategory', 'HeightInMeters', 'WeightInKilograms', 'BMI', 'GeneralHealth', 'PhysicalHealthDays', 'MentalHealthDays', 'LastCheckupTime', 'PhysicalActivities', 'SleepHours', 'AlcoholDrinkers', 'SmokerStatus', 'ECigaretteUsage', 'HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis', 'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty', 'DifficultyConcentrating', 'DifficultyWalking', 'DifficultyDressingBathing', 'DifficultyErrands', 'ChestScan', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'HighRiskLastYear', 'CovidPos', 'BMI_category', 'Age_BMI_interaction', 'HealthScore', 'LifestyleScore', 'ComorbidityCount', 'CheckedWithinYear']
Adding the results of Experiments as a dictionary to a list so we can plot the F1 scores at the end#
experiment_results = []
for name, pipeline in pipelines:
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='f1')
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
result = {
"experiment_name": f"Experiment 3: {name} with Feature Engineering",
"cv_f1_mean": cv_scores.mean(),
"cv_f1_std": cv_scores.std(),
"test_accuracy": accuracy,
"test_f1": f1
}
experiment_results.append(result)
# Add these results to the overall list of experiment results
all_experiment_results.extend(experiment_results)
# Store the best model information
best_model_result = {
"experiment_name": f"Experiment 3: Best Model - {type(best_model.named_steps['classifier']).__name__}",
"test_f1": best_f1,
}
all_experiment_results.append(best_model_result)
Experiment 4#
Feature Selection
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
import mlflow
class FeatureEngineer:
def __init__(self):
pass
def fit(self, X, y=None):
return self
def transform(self, X):
X = X.copy()
X['BMI_category'] = pd.cut(X['BMI'], bins=[0, 18.5, 25, 30, np.inf], labels=['Underweight', 'Normal', 'Overweight', 'Obese'])
age_map = {'Age 18 to 24': 21, 'Age 25 to 29': 27, 'Age 30 to 34': 32, 'Age 35 to 39': 37,
'Age 40 to 44': 42, 'Age 45 to 49': 47, 'Age 50 to 54': 52, 'Age 55 to 59': 57,
'Age 60 to 64': 62, 'Age 65 to 69': 67, 'Age 70 to 74': 72, 'Age 75 to 79': 77,
'Age 80 or older': 85}
X['Age_BMI_interaction'] = X['AgeCategory'].map(age_map) * X['BMI']
X['HealthScore'] = 30 - (X['PhysicalHealthDays'] + X['MentalHealthDays'])
X['LifestyleScore'] = (
X['PhysicalActivities'].astype(int) +
((X['SleepHours'] >= 7) & (X['SleepHours'] <= 9)).astype(int) +
(~X['AlcoholDrinkers']).astype(int) +
(X['SmokerStatus'] == 'Never smoked').astype(int)
)
comorbidity_columns = ['HadAsthma', 'HadSkinCancer', 'HadCOPD',
'HadDepressiveDisorder', 'HadKidneyDisease',
'HadArthritis', 'HadDiabetes']
X['ComorbidityCount'] = X[comorbidity_columns].sum(axis=1)
X['CheckedWithinYear'] = (X['LastCheckupTime'] == 'Within past year (anytime less than 12 months ago)').astype(int)
return X
def correlation_threshold(X, threshold=0.8):
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
corr_matrix = X[numeric_cols].corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
return X.drop(columns=to_drop)
def variance_threshold(X, threshold=0.01):
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
selector = VarianceThreshold(threshold)
X_numeric = X[numeric_cols]
selector.fit(X_numeric)
selected_numeric_cols = numeric_cols[selector.get_support()].tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
selected_cols = selected_numeric_cols + categorical_cols.tolist()
return X[selected_cols]
X = df.drop('HadHeartIssues', axis=1)
y = df['HadHeartIssues']
feature_engineer = FeatureEngineer()
X_engineered = feature_engineer.transform(X)
print("Starting feature selection...")
X_corr = correlation_threshold(X_engineered)
X_selected = variance_threshold(X_corr)
print("Feature selection completed.")
if 'Sex' not in X_selected.columns:
X_selected['Sex'] = X['Sex']
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
Starting feature selection...
Feature selection completed.
Pipelines for pre-processing#
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numeric_features),
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])
Pipelines for classification algorithms#
classifiers = [
('LogisticRegression', LogisticRegression(random_state=42)),
('RidgeClassifier', RidgeClassifier(random_state=42)),
('RandomForestClassifier', RandomForestClassifier(class_weight='balanced', random_state=42)),
('XGBClassifier', XGBClassifier(random_state=42))
]
Logging Results in MLFlow#
mlflow.set_tracking_uri("https://dagshub.com/singhvarunnn789/EAS503.mlflow")
mlflow.set_experiment("Feature_Selection")
with mlflow.start_run():
mlflow.log_param("n_features_original", X.shape[1])
mlflow.log_param("n_features_final", X_selected.shape[1])
for name, classifier in classifiers:
pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', classifier)
])
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='f1')
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
mlflow.log_metric(f"{name}_cv_f1_score", cv_scores.mean())
mlflow.log_metric(f"{name}_test_accuracy", accuracy)
mlflow.log_metric(f"{name}_test_f1_score", f1)
print(f"{name}:")
print(f" CV F1-score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
print(f" Test Accuracy: {accuracy:.4f}")
print(f" Test F1-score: {f1:.4f}")
print("Experiment completed.")
LogisticRegression:
CV F1-score: 0.7570 (+/- 0.0089)
Test Accuracy: 0.7534
Test F1-score: 0.7595
RidgeClassifier:
CV F1-score: 0.7579 (+/- 0.0108)
Test Accuracy: 0.7539
Test F1-score: 0.7618
RandomForestClassifier:
CV F1-score: 0.7429 (+/- 0.0077)
Test Accuracy: 0.7419
Test F1-score: 0.7486
XGBClassifier:
CV F1-score: 0.7536 (+/- 0.0097)
Test Accuracy: 0.7526
Test F1-score: 0.7614
๐ View run polite-shrimp-669 at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/3/runs/72b2e870ad31461d866e93f69dd65e49
๐งช View experiment at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/3
Experiment completed.
Adding the results of Experiments as a dictionary to a list so we can plot the F1 scores at the end#
experiment_results = []
for name, classifier in classifiers:
pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', classifier)
])
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='f1')
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
result = {
"experiment_name": f"Experiment 4: {name} with Feature Selection",
"cv_f1_mean": cv_scores.mean(),
"cv_f1_std": cv_scores.std(),
"test_accuracy": accuracy,
"test_f1": f1
}
experiment_results.append(result)
# Add these results to the overall list of experiment results
all_experiment_results.extend(experiment_results)
print(all_experiment_results)
[{'experiment_name': 'Experiment 1: Logistic Regression', 'f1_score': 0.5963351709658641, 'confusion_matrix': {'TN': 6511, 'FP': 1630, 'FN': 4696, 'TP': 3418}, 'cv_f1_mean': 0.5143968784013151, 'cv_f1_std': 0.008349098015545535}, {'experiment_name': 'Experiment 2: LogisticRegression', 'cv_f1_mean': 0.4999428588564096, 'cv_f1_std': 0.00874662016728795, 'test_accuracy': 0.6109504767763765, 'test_f1': 0.5932591979685936}, {'experiment_name': 'Experiment 2: RidgeClassifier', 'cv_f1_mean': 0.49318745745559694, 'cv_f1_std': 0.008377266578019512, 'test_accuracy': 0.609781605659797, 'test_f1': 0.5903170171463006}, {'experiment_name': 'Experiment 2: RandomForestClassifier', 'cv_f1_mean': 0.5768916081156558, 'cv_f1_std': 0.004431388609656797, 'test_accuracy': 0.5843125192248539, 'test_f1': 0.5842262734049917}, {'experiment_name': 'Experiment 2: XGBClassifier', 'cv_f1_mean': 0.5951801721677257, 'cv_f1_std': 0.006505189170044682, 'test_accuracy': 0.6225776684097201, 'test_f1': 0.6210547722910602}, {'experiment_name': 'Experiment 3: LogisticRegression with Feature Engineering', 'cv_f1_mean': 0.7569901748356858, 'cv_f1_std': 0.004334048344789438, 'test_accuracy': 0.7533066748692709, 'test_f1': 0.7594192464602831}, {'experiment_name': 'Experiment 3: RidgeClassifier with Feature Engineering', 'cv_f1_mean': 0.7577506298886091, 'cv_f1_std': 0.005742904962691456, 'test_accuracy': 0.7534912334666256, 'test_f1': 0.7613318244088392}, {'experiment_name': 'Experiment 3: RandomForestClassifier with Feature Engineering', 'cv_f1_mean': 0.7414216941069476, 'cv_f1_std': 0.0043142351185395155, 'test_accuracy': 0.7387265456782528, 'test_f1': 0.7453378905078851}, {'experiment_name': 'Experiment 3: XGBClassifier with Feature Engineering', 'cv_f1_mean': 0.7538860749081145, 'cv_f1_std': 0.004259417111887184, 'test_accuracy': 0.7496155029221778, 'test_f1': 0.758054928070384}, {'experiment_name': 'Experiment 3: Best Model - RidgeClassifier', 'test_f1': 0.7613318244088392}, {'experiment_name': 'Experiment 4: LogisticRegression with Feature Selection', 'cv_f1_mean': 0.7570201140509777, 'cv_f1_std': 0.004455890177913308, 'test_accuracy': 0.7534297139341741, 'test_f1': 0.7595392368610511}, {'experiment_name': 'Experiment 4: RidgeClassifier with Feature Selection', 'cv_f1_mean': 0.7578802646702998, 'cv_f1_std': 0.0053923952704475735, 'test_accuracy': 0.7539218701937865, 'test_f1': 0.7618196975110159}, {'experiment_name': 'Experiment 4: RandomForestClassifier with Feature Selection', 'cv_f1_mean': 0.7428773797994953, 'cv_f1_std': 0.003861000917223668, 'test_accuracy': 0.741864041833282, 'test_f1': 0.7485919712402637}, {'experiment_name': 'Experiment 4: XGBClassifier with Feature Selection', 'cv_f1_mean': 0.753621048443774, 'cv_f1_std': 0.0048482811260100326, 'test_accuracy': 0.7525684404798524, 'test_f1': 0.7613622878841818}]
Experiment 5#
Pipeline for Pre-processing#
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import mlflow
import matplotlib.pyplot as plt
X = df.drop('HadHeartIssues', axis=1)
y = df['HadHeartIssues']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Preprocessing
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numeric_features),
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])
X_preprocessed = preprocessor.fit_transform(X)
Setting up PCA and Classification algorithms#
pca = PCA()
X_pca = pca.fit_transform(X_preprocessed)
# Create scree plot
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
n_components = len(pca.explained_variance_ratio_)
plt.figure(figsize=(10, 6))
plt.plot(range(1, n_components + 1), cumulative_variance_ratio, 'bo-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Scree Plot')
plt.axhline(y=0.95, color='r', linestyle='--')
plt.savefig('scree_plot.png')
plt.show()
plt.close()
# Select number of components that explain 95% of variance
n_components_95 = np.argmax(cumulative_variance_ratio >= 0.95) + 1
pca_95 = PCA(n_components=n_components_95)
X_pca_95 = pca_95.fit_transform(X_preprocessed)
X_train, X_test, y_train, y_test = train_test_split(X_pca_95, y, test_size=0.2, random_state=42)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
classifiers = [
('LogisticRegression', LogisticRegression(random_state=42)),
('RidgeClassifier', RidgeClassifier(random_state=42)),
('RandomForestClassifier', RandomForestClassifier(class_weight='balanced', random_state=42)),
('XGBClassifier', XGBClassifier(random_state=42))
]
Logging Results in MLFlow#
mlflow.set_tracking_uri("https://dagshub.com/singhvarunnn789/EAS503.mlflow")
mlflow.set_experiment("PCA_Dimensionality_Reduction")
with mlflow.start_run():
mlflow.log_param("n_features_original", X.shape[1])
mlflow.log_param("n_components_95", n_components_95)
mlflow.log_artifact("scree_plot.png")
for name, classifier in classifiers:
clf = classifier.fit(X_train_resampled, y_train_resampled)
cv_scores = cross_val_score(clf, X_train_resampled, y_train_resampled, cv=5, scoring='f1')
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
mlflow.log_metric(f"{name}_cv_f1_score", cv_scores.mean())
mlflow.log_metric(f"{name}_test_accuracy", accuracy)
mlflow.log_metric(f"{name}_test_f1_score", f1)
print(f"{name}:")
print(f" CV F1-score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
print(f" Test Accuracy: {accuracy:.4f}")
print(f" Test F1-score: {f1:.4f}")
print("Experiment completed.")
LogisticRegression:
CV F1-score: 0.7347 (+/- 0.0070)
Test Accuracy: 0.7369
Test F1-score: 0.7374
RidgeClassifier:
CV F1-score: 0.7328 (+/- 0.0081)
Test Accuracy: 0.7360
Test F1-score: 0.7364
RandomForestClassifier:
CV F1-score: 0.7320 (+/- 0.0109)
Test Accuracy: 0.7289
Test F1-score: 0.7352
XGBClassifier:
CV F1-score: 0.7406 (+/- 0.0062)
Test Accuracy: 0.7373
Test F1-score: 0.7466
๐ View run unleashed-hare-695 at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/4/runs/081f8449da414a0f938cbd3df0377077
๐งช View experiment at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/4
Experiment completed.
Adding the results of Experiments as a dictionary to a list so we can plot the F1 scores at the end#
# Store results for each classifier
experiment_results = []
for name, classifier in classifiers:
clf = classifier.fit(X_train_resampled, y_train_resampled)
cv_scores = cross_val_score(clf, X_train_resampled, y_train_resampled, cv=5, scoring='f1')
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
result = {
"experiment_name": f"Experiment 5: {name} with PCA",
"cv_f1_mean": cv_scores.mean(),
"cv_f1_std": cv_scores.std(),
"test_accuracy": accuracy,
"test_f1": f1,
}
experiment_results.append(result)
# Add these results to the overall list of experiment results
all_experiment_results.extend(experiment_results)
print("Results stored for Experiment 5.")
Results stored for Experiment 5.
Custom Experiment 1#
This experiment uses a pipeline with preprocessing, feature selection, and a Random Forest classifier.
It employs grid search for hyperparameter tuning and Gradient Boosting for feature selection.
The pipeline includes StandardScaler for numeric features and OneHotEncoder for categorical features.
The experiment uses MLflow to track parameters, metrics, and the best model.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
import mlflow
# Load your dataset
X = df.drop('HadHeartIssues', axis=1)
y = df['HadHeartIssues']
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
# Create preprocessing steps
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numeric_features),
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])
# Create a pipeline with preprocessing, feature selection, and classifier
pipeline = Pipeline([
('preprocessor', preprocessor),
('feature_selection', SelectFromModel(GradientBoostingClassifier(random_state=42))),
('classifier', RandomForestClassifier(random_state=42))
])
# Define hyperparameters to tune
param_grid = {
'feature_selection__estimator__n_estimators': [100, 200],
'feature_selection__estimator__max_depth': [3, 5],
'classifier__n_estimators': [100, 200],
'classifier__max_depth': [5, 10],
'classifier__min_samples_split': [2, 5]
}
# Set up MLflow for tracking experiments
mlflow.set_tracking_uri("https://dagshub.com/singhvarunnn789/EAS503.mlflow")
mlflow.set_experiment("Custom_Experiment_6")
with mlflow.start_run():
# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)
# Get the best model
best_model = grid_search.best_estimator_
# Make predictions
y_pred = best_model.predict(X_test)
# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# Log parameters and metrics
mlflow.log_params(grid_search.best_params_)
mlflow.log_metric("accuracy", accuracy)
mlflow.log_metric("f1_score", f1)
# Log the model
mlflow.sklearn.log_model(best_model, "best_model")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
print("Custom experiment completed.")
2024/12/20 23:32:40 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Best parameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200, 'feature_selection__estimator__max_depth': 3, 'feature_selection__estimator__n_estimators': 100}
Accuracy: 0.7347
F1-score: 0.7440
๐ View run dashing-croc-199 at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/5/runs/b36b25e0efd54f949d930c55838c0508
๐งช View experiment at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/5
Custom experiment completed.
Adding the results of Experiments as a dictionary to a list so we can plot the F1 scores at the end#
# Store results for the custom experiment
experiment_result = {
"experiment_name": "Experiment 6: Custom Pipeline with Feature Selection",
"test_accuracy": accuracy,
"test_f1": f1,
"cv_f1_mean": grid_search.best_score_,
"cv_f1_std": grid_search.cv_results_['std_test_score'][grid_search.best_index_],
}
all_experiment_results.append(experiment_result)
print("Results stored for Experiment 6.")
Results stored for Experiment 6.
Custom Experiment 2#
This experiment focuses on ensemble methods and advanced feature engineering.
It uses a stacking classifier combining Random Forest and Gradient Boosting, with Logistic Regression as the final estimator.
The pipeline includes preprocessing and Bayesian optimization for hyperparameter tuning.
The experiment logs results in MLflow.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier
from skopt import BayesSearchCV
import mlflow
def create_interaction_features(X):
age_map = {'Age 18 to 24': 21, 'Age 25 to 29': 27, 'Age 30 to 34': 32, 'Age 35 to 39': 37,
'Age 40 to 44': 42, 'Age 45 to 49': 47, 'Age 50 to 54': 52, 'Age 55 to 59': 57,
'Age 60 to 64': 62, 'Age 65 to 69': 67, 'Age 70 to 74': 72, 'Age 75 to 79': 77,
'Age 80 or older': 85}
X['Age'] = X['AgeCategory'].map(age_map)
X['BMI_Age'] = X['BMI'] * X['Age']
X['ComorbidityScore'] = X[['HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder',
'HadKidneyDisease', 'HadArthritis', 'HadDiabetes']].sum(axis=1)
return X
X = df.drop('HadHeartIssues', axis=1)
y = df['HadHeartIssues']
X = create_interaction_features(X)
# Use a smaller subset of data for faster execution
X_sample, _, y_sample, _ = train_test_split(X, y, train_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numeric_features),
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])
base_estimators = [
('rf', RandomForestClassifier(random_state=42)),
('gb', GradientBoostingClassifier(random_state=42))
]
stacking_classifier = StackingClassifier(
estimators=base_estimators,
final_estimator=LogisticRegression(random_state=42),
cv=3
)
pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', stacking_classifier)
])
param_space = {
'classifier__rf__n_estimators': (50, 200),
'classifier__gb__n_estimators': (50, 200),
'classifier__final_estimator__C': (1e-3, 1e3, 'log-uniform')
}
bayes_search = BayesSearchCV(
pipeline, param_space, n_iter=20, cv=3, scoring='f1', random_state=42, n_jobs=-1
)
mlflow.set_tracking_uri("https://dagshub.com/singhvarunnn789/EAS503.mlflow")
mlflow.set_experiment("Custom_Experiment_7")
with mlflow.start_run():
bayes_search.fit(X_train, y_train)
best_model = bayes_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
mlflow.log_params(bayes_search.best_params_)
mlflow.log_metric("accuracy", accuracy)
mlflow.log_metric("f1_score", f1)
mlflow.sklearn.log_model(best_model, "best_model")
print(f"Best parameters: {bayes_search.best_params_}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
print("Custom experiment completed.")
2024/12/21 00:05:28 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Best parameters: OrderedDict({'classifier__final_estimator__C': 1000.0, 'classifier__gb__n_estimators': 50, 'classifier__rf__n_estimators': 155})
Accuracy: 0.7468
F1-score: 0.7573
๐ View run languid-bass-97 at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/6/runs/0761b33399474c75ab09b9fcbdf661bd
๐งช View experiment at: https://dagshub.com/singhvarunnn789/EAS503.mlflow/#/experiments/6
Custom experiment completed.
Adding the results of Experiments as a dictionary to a list so we can plot the F1 scores at the end#
# Store results for the custom experiment
experiment_result = {
"experiment_name": "Experiment 7: Stacking Classifier with Bayesian Optimization",
"test_accuracy": accuracy,
"test_f1": f1,
"cv_f1_mean": bayes_search.best_score_,
}
# Add this result to the overall list of experiment results
all_experiment_results.append(experiment_result)
print("Results stored for Experiment 7.")
Results stored for Experiment 7.
Creating F1 Score Plots#
import matplotlib.pyplot as plt
%matplotlib inline
# Extract experiment names, F1 scores, and accuracy scores, handling missing keys
def get_metric(result, key):
return result.get(key, None)
experiment_names = [result['experiment_name'] for result in all_experiment_results]
f1_scores = [get_metric(result, 'test_f1') or get_metric(result, 'f1_score') for result in all_experiment_results]
accuracy_scores = [get_metric(result, 'test_accuracy') for result in all_experiment_results]
# Filter out None values for plotting
valid_indices = [i for i, f1 in enumerate(f1_scores) if f1 is not None and accuracy_scores[i] is not None]
experiment_names = [experiment_names[i] for i in valid_indices]
f1_scores = [f1_scores[i] for i in valid_indices]
accuracy_scores = [accuracy_scores[i] for i in valid_indices]
# Create the bar plot
plt.figure(figsize=(14, 7))
x = range(len(experiment_names))
plt.bar(x, f1_scores, width=0.4, label='F1 Score', color='blue', align='center')
plt.bar(x, accuracy_scores, width=0.4, label='Accuracy', color='orange', align='edge')
# Add labels and title
plt.xticks(x, experiment_names, rotation=45, ha='right')
plt.xlabel('Experiment')
plt.ylabel('Score')
plt.title('F1 Scores and Accuracy for Each Experiment')
plt.legend()
<matplotlib.legend.Legend at 0x18123d5fe00>